import nltk
import sqlite3
from sklearn.cross_validation import train_test_split
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
__author__ = 'panagiotis'
from models import ColumnSelector, UniFeatureSelector
from models import DocVectorizer, SentimentTransformer, MultiDocVectorizer
from models import SpellingTransformer, RemoveContractions, NegationTransformer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score


# Feature-extraction pipeline. Review titles (column 0) and bodies
# (column 1) are processed in parallel branches: each is run through the
# same token-level clean-up chain (expand contractions -> spell-correct ->
# mark negations), then vectorized three ways and concatenated:
#   * bag-of-words TF-IDF with univariate feature selection,
#   * doc2vec-style aspect embeddings with univariate feature selection,
#   * sentiment features.

def _cleanup_chain():
    """Return a fresh preprocessing pipeline (one instance per branch)."""
    return Pipeline([
        ('contract', RemoveContractions(spell_check=False, output="tokens")),
        ('spelling', SpellingTransformer(tokenize=False, output="tokens")),
        ('negations', NegationTransformer(tokenize=False, output="tokens")),
    ])

_STOPWORDS = nltk.corpus.stopwords.words('english')

_title_branch = Pipeline([
    ('extract', ColumnSelector(p=0)),
    ('preprocess', _cleanup_chain()),
    ('vectorize', FeatureUnion([
        ('bag-of-words', Pipeline([
            # Input is already tokenized, hence the identity tokenizer.
            ("tfidf", TfidfVectorizer(tokenizer=lambda x: x, lowercase=False,
                                      stop_words=_STOPWORDS,
                                      ngram_range=(1, 1), min_df=1, max_df=0.5,
                                      binary=True, use_idf=False, norm="l1")),
            ("select", UniFeatureSelector(2000)),
        ])),
        ('doc2vec', Pipeline([
            ("aspects", MultiDocVectorizer(n_features=(38, 55, 65, 110),
                                           tokenize=False, lemmatize=False)),
            ("select", UniFeatureSelector(150)),
        ])),
        ('sentiment', SentimentTransformer(vectorizer="count", tokenize=False,
                                           lemmatize=True)),
    ])),
])

_body_branch = Pipeline([
    ('extract', ColumnSelector(p=1)),
    ('preprocess', _cleanup_chain()),
    ('vectorize', FeatureUnion([
        ('bag-of-words', Pipeline([
            # Bodies use bigrams + idf weighting, unlike the title branch.
            ("features", TfidfVectorizer(tokenizer=lambda x: x, lowercase=False,
                                         stop_words=_STOPWORDS,
                                         ngram_range=(1, 2), min_df=1, max_df=0.5,
                                         binary=True, use_idf=True, norm="l2")),
            ("select", UniFeatureSelector(3500)),
        ])),
        ('doc2vec', Pipeline([
            ("aspects", MultiDocVectorizer(n_features=(50, 70, 100, 105),
                                           tokenize=False, lemmatize=True)),
            ("select", UniFeatureSelector(250)),
        ])),
        ('sentiment', SentimentTransformer(vectorizer="tf", tokenize=False,
                                           lemmatize=True)),
    ])),
])

vectorizer = Pipeline([('features', FeatureUnion([
    ('title', _title_branch),
    ('body', _body_branch),
]))])


# load review data: only the title, body text and overall rating are needed.
database_file = "Hotels_g189413_Crete.db"
database_path = "/home/panagiotis/Projects/Thesis/datasets/"
# database_path = "/home/pstalidis/Projects/Thesis/datasets/"
conn = sqlite3.connect(database_path + database_file)
try:
    # Select the three columns by name instead of SELECT * with a 19-name
    # positional unpack -- robust against schema/column-order changes.
    data = list(conn.execute(
        "SELECT review_title, review_text, review_rating FROM reviews"))
finally:
    conn.close()  # the original never closed the connection

# Build a class-balanced split: 100 training and 100 test reviews per
# rating level (1-5), split deterministically (random_state=0) and then
# shuffled so same-rating reviews are not contiguous.
train_parts, test_parts = [], []
for rating in (1, 2, 3, 4, 5):
    tr, ts = train_test_split([r for r in data if r[2] == rating],
                              train_size=100, test_size=100, random_state=0)
    train_parts.append(tr)
    test_parts.append(ts)

train_data = shuffle(sum(train_parts, []), random_state=0)
test_data = shuffle(sum(test_parts, []), random_state=0)

train_titles, train_review, train_target = zip(*train_data)
test_titles, test_review, test_target = zip(*test_data)

del train_parts, test_parts

# Re-pair (title, body) tuples for the vectorizer; ratings become floats.
train_data = zip(train_titles, train_review)
test_data = zip(test_titles, test_review)

train_target = [float(r) for r in train_target]
test_target = [float(r) for r in test_target]

del train_titles, train_review, test_titles, test_review, data

training_vectors = vectorizer.fit_transform(train_data, train_target)
testing_vectors = vectorizer.transform(test_data)

vectorizer = None

classifier = LogisticRegression()
classifier.fit(training_vectors, train_target)
results = classifier.predict(testing_vectors)

print "precision", precision_score(test_target, results)
print "recall", recall_score(test_target, results)
print "f-measure", f1_score(test_target, results)
print confusion_matrix(test_target, results)

# parameters = {
#     "features__title__vectorize__bag-of-words__select__max_features": (2000,),
#
#
#
# }
#
# Pipeline([
#     ("sentiments", LinearSVC()),
#     ('select', LinearSVC(penalty="l1", dual=False, tol=1e-3)),
# ])

#
# test_target = [1,0,0,5,0,1,0,1,0,1,0,1]
# results = [1,0,0,1,0,1,1,0,5,1,0,1]
# print roc_auc_score(test_target, results)

